What I learned from 



Excelsior JET 
V8 

Dart VM 
LuaJIT






torch 


«A SCIENTIFIC COMPUTING FRAMEWORK FOR LUAJIT» 



deep internal insight
overview of interesting things 


local p = { x = 1, y = 1 }
for i = 1, 100 do
  p = { x = p.x + i,
        y = p.y - i }
end




whirlwind
introduction to Lua


j/zra 


-- dynamically typed
local v
v = 1
v = "string"
v = true
v = { } -- table
v = function () end


wire 


-- tables are key-value
-- key is any type
local p = {
  x = 1,
  y = 1,
}


dictionaries


F/ira 


-- tables are key-value
-- key is any type
local p = {
  ['x'] = 1,
  ['y'] = 1,
  [222] = 1,
  [{ }] = 1
}


dictionaries


slits 


-- single numeric type:
-- double precision floating point
type(1)   -- 'number'
type(1.0) -- 'number'
type(1.1) -- 'number'


sfzra 


-- metatables alter behavior of tables
local tbl = {}
setmetatable(tbl, {
  __index = function (self, key)
    print('__index', key)
    return 0
  end,
  __newindex = function (self, key, val)
    print('__newindex', key, val)
  end
})


-- metatables alter behavior of tables
print(tbl['somekey'])
-- __index somekey
-- 0
tbl[42] = 'somevalue';
-- __newindex 42 somevalue


TlftW 


local tbl = {}
setmetatable(tbl, {
  __index = { x = 42 }
})

print(tbl.x) -- 42


irJlTS 


-- metatables alter behavior of tables
setmetatable(tbl, {
  -- will be called when evaluating
  -- + expression with tbl
  __add = function ()
    -- ...
  end
})




local p = { x = 1, y = 1 }
for i = 1, 100 do
  p = { x = p.x + i,
        y = p.y - i }
end




->LOOP: 

xorps xmm5, xmm5 
cvtsi2sd xmm5, ebp 
addsd xmm6, xmm5 
subsd xmm7, xmm5 
add ebp, +0x01 
cmp ebp, +0x64 
jle ->LOOP 
jmp ->4 


lijltS 


« how does it 

do it? » 

learning by reading sources 


iwira 


local p = { x = 1, y = 1 }
for i = 1, 100 do
  p = { x = p.x + i,
        y = p.y - i,
        [1] = p[1] }
end



vfm 


-> LOOP : 

movsd [rsp+0x2S], XHW6 
movsd [rsp+0x30] f juiirti? 
mov [rsp+0x24] , eax 
mov edij [0x0G0423dS] 
cmp edi, [0xOO0423dc] 
jb skip 
mov esi, 0x1 
mov edij 0x000423b3 
call -^lj_gc_step_jit 
test eax , eax 
jnz ->4 
skip: 

mov edij [0x0Q0424b0] 

mov esij 0X00G5294B 

call ->lj_tab_dup 

mov esij, eax 

mov [rsp+0x20] j esi 

mov edij [0xOG0424b@] 

mov eaKj [rsp+0x24] 

movsd xmm7, [rsp+0x30] 

movsd xmm5> [rsp+0x28] 

cmp dword [rax+0xic] , +0x01 

jins ->4 

mov rlSdj [rax+0xl4] 
mov rbx, OxfffffffbOOOSSeSO 
cmp rbxj [r 15 +0x2-0] 
jnz ->4 


xorps xmmS, xmm& 
cvtsi2sd xmnn6j ebp 
addsd xmn>5 , xmm& 
movsd [rsp+GxlS], xmmS 
mov ebx,, [rsi+0xl4] 
movsd [rtox+OxllS] j xnwnS 
mov rdx, CxfffffffbC 004a IBS 
cmp rdx, [rl 5+0x3] 
jnz ->5 

subsd xmm7j xmm& 
movsd I[rsp+0xl3]j xmmT 
movsd [rbx], xmm7 
cmp dword [ rax +0x18 } f +0x01 
jbe ->6 

mov ebx, [rax +0x3] 

cmp dword [rbx+0x,c] J Sxfffeffff 

job 

movsd xmmSj [rbx +0x8] 
movsd [rsp+0x8]j xnimS 
mov edx, CixOOQSSSdB 
call ->lj_tab_newkey 
mov ebx, eax 
mov eax, [rsp+0x20] 
movsd xmm?, [rsp+OxlS] 
mov sd x mmS } [ r s p + 0x 10] 
movsd xmniSj [rsp+0x8] 
movsd [rbx], xmm5 
add ebp, +0x01 
cmp ebp, +0x64 
jle -> LOOP 
jmp ->? 


TiJZW 




pj.iih 


« why does it 
not do it? » 

learning by fixing bugs 


KUI1S 


1GB memory limit 

(pre v2.l) 


in j us 


Lua is dynamically 


NaN-tagging 


Z2JZ19 


sign mantissa (52 bit) 

v / \ 

\ / 

exponent (11 bit) 





sign mantissa (52 bit) 

v / \ 


\ / 

exponent (11 bit) 
NaN: E = 7ff & M ≠ 0


Zhira 



mantissa (52 bit) 


sign 

v / 

\ / 

exponent (11 bit) 

NaN: E = 7ff & M ≠ 0 (whole


\ 

(64 bits) 


family of NaNs) 



TValue 

dynamically typed slot 


BJ'ZW 


MSW 


LSW 


\/ 

double 


\ 64 bit 
number 
gc obj 


zsjzra 







/ MSW \/ LSW \ 64 bit 


double 



number 
gc obj 


number tag < ffff0000
table tag = fffffff4 = ~11u









\1. .l/\/\ 47 bits 


tag (4 bits) 


\ 64 bit 
number 
gc64 obj 


30JZT5 





kinda works 

AArch64: 52-bit VA 


JlJZW 


changing tagging 
tough exercise 




// Macros to test operand types.
.macro checktp, reg, tp
  cmp dword [BASE+reg*8+4], tp
.endmacro

.macro checktab, reg, target
  checktp reg, LJ_TTAB
  jne target
.endmacro


case BC_TGETB:
  ins_ABC // RA = dst, RB = table, RC = byte literal
  checktab RB, ->vmeta_tgetb
  mov TAB:RB, [BASE+RB*8]




DynASM 

generates code that 
generates code 




case BC_TGETB:


// 

// 

// 


ins_ABC // RA = dst, RB = table, RC = byte literal 
checktab RB, ->vmeta_tgetb 
mov TAB: RB, [BASE+RB*8] 


dasm_put(Dst, 10994, LJ_TTAB, Dt6(->asize), Dt6(->array), LJ_TNIL,


Ejzra 


// Type definitions. Some of these are only used for documentation.
.type L, lua_State
.type GL, global_State
...
mov GL:RB, L:RB->glref
mov dword GL:RB->vmstate, ~LJ_VMST_C




// Type definitions. Some of these are only used for documentation.
.type L, lua_State
.type GL, global_State

mov GL:RB, [RB, #offsetof(lua_State, glref)]
mov dword GL:RB->vmstate, ~LJ_VMST_C


Wfi H 


no actual understanding of types 


38JZ19 


cmp dword L: RB->openupval , 0 


cmp dword 


L: RB->openupval 

AAAAAAAAAAAAAAA 


0 

pointer


waits 


cmp aword L:RB->openupval, 0


what is interpreter 
interpreting? 


u-ifrra 


/ 32 bits \ 

H 1 h 1 1" 

| OP I I I I 

H H h -4 h 

| OP I I | 

0 32 


Format 


Format 


ABC 

AD 


*3j'ZW 


BASE 

4 ' 

— 1 <^9 I „ || _ ^9- ■- J I I . i I l_ | - , I I I ^9 I I I J . ,| 1 1 '^9' '1 I U_ | 

| R0 | R1 | R2 | 


TValue 


(64bit) 


+ 


'V 



+ 




wits 


CALL A, ResN, ArgN

F <- R(A);

R(A), ..., R(A+ResN-2) <- F(R(A+1), ..., R(A+ArgN-1)), if ResN != 0

R(A), ... <- F(R(A+1), ..., R(A+ArgN-1)), if ResN == 0




BASE 

nU 








| Func | Arg0 


R (A) 







■r+J 


BASE 

nU 

/Hj — + — 

| | Func 


BASE 

I Arg0 

■T 

R (0) 


I Argl | 


WJ'ItS 


frame linking 


OKftTS 


BASE 

4 - 


BASE 


| | Func | Arg0 

/ \ 

/ \ 

[ tag | ptr ] 



win 


BASE 

4 - 


BASE 


| | Func | Arg0 

/ \ 

/ \ 

[ link j ptr ] 



Mjzra 


link 


PC 

00 

'T 

1 

Lua frame 

delta 

001 

1 

C frame 

delta 

010 

1 

Continuation frame 

delta 

011 

1 

Lua vararg frame 

delta 

101 

i 

cpcall() frame 


etc . . . 


PC is 4 byte aligned 
delta is 8 byte aligned 




link 


H — — 

PC 00 | Lua frame 

delta 001 | C frame 
delta 010 | Continuation frame 
delta 011 | Lua vararg frame 
delta 101 | cpcall() frame

. > > . etc > i . 

PC is 4 byte aligned 
delta is 8 byte aligned 


isftti 


when unwinding look at PC-1 to determine 

caller's BASE 

CALL A, ... => CallerBASE = BASE - A 




link 


H — — 

PC 00 | Lua frame 
delta 001 | C frame
delta 010 | Continuation frame

delta 011 | Lua vararg frame 
delta 101 | cpcall() frame 

. > > . etc > i . 

PC is 4 byte aligned 
delta is 8 byte aligned 


jtJIW 


continuations allow to specify action to 
perform when callee returns 


;; jump to target if R(A) == R(D)

ISEQV A, D
JUMP target




;; jump to target if R(A) == R(D) 

ISEQV A, D 
JUMP target

;; what if R(A) has eq metamethod? 


;; jump to target if R(A) == R(D) 

ISEQV A, D 
JUMP target

;; what if R(A) has eq metamethod? 

;; need to call metamethod 
;; ... then branch on return 




;; jump to target if R(A) == R(D) 

ISEQV A, D 
JUMP target

;; what if R(A) has eq metamethod? 

;; need to call metamethod 
;; ... then branch on return 




interpreter
+ 

| * ® * 

| PC -> ISEQV A, D

|       JUMP target


waits 


interpreter

+ 

| -|~ ■ h 

| | nested interpreter | 

I I for the metamethod I 


r/uira 


interpreter
+ 

| * ® * 

| PC -> ISEQV A, D

|       JUMP target


branch on the result from 
the nested interpreter 




continuations make it 

simpler 


MJIfl 


BASE metamethod 

•j, / — frame — > 

\ / continuation callback 

current frame (e.g. cont_condt) 


(WITS 


let's talk about 
DISPATCH 


MJira 


jmp aword [DISPATCH+OP*4]


jmp aword [DISPATCH+OP*4]

can replace handlers 




profiling 

recording 





;; hotcounting 
; ; loop bytecodes 

FORL 

ITERL 

LOOP 


;; function entries 

FUNCF


MjZT9 


.macro hotloop, reg
  mov reg, PC
  shr reg, 1
  and reg, HOTCOUNT_PCMASK
  sub word [DISPATCH+reg+GG_DISP2HOT], HOTCOUNT_LOOP
  jb ->vm_hotloop
.endmacro


hotcount[(PC>>2) & (HOTCOUNT_SIZE-1)]


nfts 


#define HOTCOUNT_SIZE 64 

hotcount[(PC>>2) & (HOTCOUNT_SIZE-1)]


m'ira 


#define HOTCOUNT_SIZE 64 

hotcount[(PC>>2) & (HOTCOUNT_SIZE-1)]

/* can cause non-determinism */




recording pipeline 




tracing 101 


nJlw 


JWZTS 




rajzra 



jsfizra 



BCUi™ 



tn/JW 



tKJzra 


BMW 


aura 



uuira 








usjzra 



saizra 



KUZW 


51/2W 



m'ira 



wjzra 



MJltS 






Wi'ZW 


______ 

guard- A A A A 


hot side exits spawn side traces 


back to record 


concrete 

values 


INTERPRETER 

.+ 

~_ 4 „_ + ___ 4 _„ 4 ^« 

i i i i i i 

+ “' b 4 *“' +-~ 

II# 

> ADDW r0 ? r0 ? rl === 


RECORDER 

— 4 — — 4 - 

I I I 

"'“4“^— 4 j 


SSA 

refs 

-v- *+ 

— 4— 4— 

“4“' 4“™ 

* * f 

> SSA IR 

a * ■ 

■ — 4 


100)119 


concrete 

values 


INTERPRETER 



| num| num| 

™-+ + + +-~ 


i » i 


> ADDW r0 ? 



+ 


+ 


SSA 

refs 

RECORDER | 



~ i — I — + r — 


001 002 


+ +. — -+- 


* * * 


===========> SSA IR 


+ 


+ 


+ 


Tji.'ifl 


concrete 

values 

INTERPRETER | 




| num|num| 

+ 4 H ■^+^ ,v 


ADDVV r0 , 

> SUBVN rl > 


rO j 

rl, 



+ 


+ 


SSA 

refs 

RECORDER | 

+--- ---v- 

~ — i — I — + r — f 


003)002 

"'-■t — + +- ~ 

003: ADD 001 3 002 
==========> SSA IR 


+ 


+ 


+ 


Kaftr. 


concrete 

values 


INTERPRETER 

| nun | num| 

~-+ + + 

subvn ri, n, 


-+-~ 

i 

-+ — 
+l 


SSA 

refs 

RECORDER | 

+ v . 

~ i — I r ’ — + r — f 


003 j 004 

- + + + +- 


004: SUB 002 


+1 


=========> SSA IR 




+ 


+ 


HBJZfl 



IMJItS 


/* Trace object. */
typedef struct GCtrace {
  /* IR instructions/constants.
  ** Biased with REF_BIAS.
  */
  IRIns *ir;
} GCtrace;


1 arm 


/* Trace object. */
typedef struct GCtrace {
  /* IR instructions/constants.
  ** Biased with REF_BIAS.
  */
  IRIns *ir;
} GCtrace;


m'zts 


typedef uint16_t IRRef1;

/* Fixed references. */
enum {
  REF_TRUE = REF_BIAS-3,
  REF_FALSE = REF_BIAS-2,
  REF_NIL = REF_BIAS-1,
  /* \ Constants grow downwards. */
  REF_BIAS = 0x8000,
  /* / IR grows upwards. */
  REF_FIRST = REF_BIAS+1,
  REF_DROP = 0xffff


KSftH 


< — constants — \ / — non-constants — > 

| false [true | nil | 

+ + — — -+ 

^ &ir[REF_BIAS]

ir := irbuf + nconsts - REF_BIAS


TO8J1W 


IRIns 

16 16 8 8 
| opl | op2 | t | o | 
| opl2/i/gco | ot | 

16 


32 


rev 





I opl | op2 I t I o 
| opl2/i/gco | ot 


I r I s I 

H + — — h 

| prev | 


prev is the reference to the previous 
instruction with the same opcode 


| opl | op2 | t | o | r | s | 

| opl2/i/gco | ot | prev | 

r/s register allocation state 




I opl | op2 I t I o I 
| opl2/i/gco | ot | 


o opcode 
t type 


rev j 




| opl | op2 | t | o I 

| opl2/i/gco | ot | 


opl/op2 IR references 


rev j 


ro.iiw 


I opl 

I opl2 / 
_| 


Op2 | t | O 

/geo I ot 

— — — ' H — 


i/gco constants (32 bit) 


prev 


+ 


+ 


liwzra 


/* Tagged IR references (32 bit). 


** 

** 

- 

- + 

-+ 



** 

| i rt 

1 flags 

| 

ref 

| 

** 

+ 

-+ 

- + 



** 

** 

The tag 

holds a 

copy 

of the 

IRType 


** and speeds up IR type checks. 
*/ 

typedef uint32_t TRef; 




BYTECODE 


> SSA IR 


nw'iw 


BYTECODE ===================> SSA IR 

I A 

v I 

interpret -> specialize -> fold&emit 




case BC_LEN:
  if (tref_isstr(rc))
    rc = emitir(IRTI(IR_FLOAD), rc, IRFL_STR_LEN);
  else if (!LJ_52 && tref_istab(rc))
    rc = lj_ir_call(J, IRCALL_lj_tab_len, rc);
  else
    rc = rec_mm_len(J, rc, rcv);
  break;




case BC_LEN:
  if (tref_isstr(rc))
    rc = emitir(IRTI(IR_FLOAD), rc, IRFL_STR_LEN);
  else if (!LJ_52 && tref_istab(rc))
    rc = lj_ir_call(J, IRCALL_lj_tab_len, rc);
  else
    rc = rec_mm_len(J, rc, rcv);
  break;


rai'ZM 


case BC_LEN:
  if (tref_isstr(rc))
    rc = emitir(IRTI(IR_FLOAD), rc, IRFL_STR_LEN);
  else if (!LJ_52 && tref_istab(rc))
    rc = lj_ir_call(J, IRCALL_lj_tab_len, rc);
  else
    rc = rec_mm_len(J, rc, rcv);
  break;


THUS? 


emitir passes instruction
to FOLD engine 


izuira 


LJFOLD(FLOAD SNEW IRFL_STR_LEN) 
LJFOLDF (fload_str_len_snew) 

{ 

/* Return length passed to SNEW. */
return fleft->op2; 


} 


LJFOLD(FLOAD SNEW IRFL_STR_LEN) 
LJFOLDF (fload_str_len_snew) 

{ 

/* Return length passed to SNEW. */ 
return fleft->op2; 

} 

// Rules hashtable generated by build 
// Rules applied until fixpoint 


lEJ.ii™ 


FWD 

DSE 

NARROW 

ABCelim 


mjira 


DCE 

LOOP 

SPLIT 

SINK 


TfijZffl 


DCE 

LOOP 

SPLIT 

SINK 


■uwzra 


local sum = 0 
for i = 1 , n do 
sum = sum + arr[i] 
end 


IDJIT9 


0006 

TGETV 

GO 

0007 

ADDVV 

r3, 

0008 

FORL 

r4 


r 1, r7 
r3 , r8 
— > 0006 


^25JZ?9 


0006 

TGETV 

00 

0007 

ADDVV 

r3, 

0008 

FORL 

r4 


r 1, r7 
r3 , r8 
=> 00G6 


r8 = rl[r7] 
r3 = r3 + r8 
r4 = r4 + r6 
if r 4 <= r5 then 
r7 = r4 
jump 0006 
end 




arr 


sum 


0006 

0007 

0008 


[ - 

TGETV 

ADDVV 

FORI 


R0 R! R2 R3 


r8, r 1 
r3 t r3 
r4 => 


, r7 


» r8 | 
0006 


fij lim step i 

R4 R5 R6 R7 R8 


J 


uaizi* 


arr 


R0 


[ 


^ 0005 

FORI 

r 4 = 

0006 

TGETV 

r8, 

0007 

ADDVV 

r3, 

0008 

FORL 

r4 = 


R1 


> 0009 

rl f rl 
r3 , r 8 

> 0006 


sum lim step i 

R3 R4 R5 R6 R7 R8 


] 


m'ira 


arr 


R0 


[ 


0005 

FORI 

r 4 = 

0006 

TGETV 

r8, 

0007 

ADDVV 

r3, 

0008 

FORL 

r4 = 


R1 


> 0009 

r 1, r7 
r3 , r 8 

> 0006 


sum (i) lim step i 


R2 R3 R4 R5 
0003 0001 

0001 SLOAD R5

0002 LE 0001 

0003 SLOAD R4 


R6 R7 

0003 

+2147483646 


arr sum (i) lim step i 




R0 R1 

R2 

R3 R4 

R5 


[ -- 

— , ____ 0004 


0003 0001 

0005 

FORI 

r4 => 0009 

0001 

SLOAD 

R5 

=> 0006 

TGETV 

r8 , rlj r7 

0002 

LE 

0001 

0007 

ADDVV 

r3 y r3> r8 

j 0003 

SLOAD 

R4 

0008 

FORL

r4 => 0006 

0004 

SLOAD 

R1 


R6 R7 
0003 

+2147483646 




R0 

orr 

R1 

R2 

sum (i) 
R3 R4 

lim 

R5 


[ -- 


0004 


0003 0001 

0005 

FORI 

M 

V 

0009 

0001 

SLOAD 

R5 

=> 0006 

TGETV 

r8 , r 1 

, r 7 

0002 

LE 

0001 

0007 

ADDVV 

r3 j r3 

? r8 

j 0003 

SLOAD 

R4 

0008 

FGRL 

ll 

V 

0006 

0004 

0005 

SLOAD 

FLOAD 

R1 

0004 


step i 
R6 R7 
0003 

+2147483646 


tab.asize 





orr 


sum (i) 

lim 

step i 



R0 

R1 

R2 

R3 R4 

R5 

R6 R7 


[ -- 


0004 


0003 0001 

0003 

0005 

FORI 

r4 => 

0009 

0001 

SLOAD 

R5 


=> 0006 

TGETV 

r8 , r 1 

, r 7 

0002 

LE 

0001 

+2147483646 

0007 

ADDVV 

r3 j r3 

? r8 

j 0003 

SLOAD 

R4 


0008 

FGRL 

r4 => 

0006 

0004 

SLOAD 

R1 






0005 

FLOAD 

0004 

tab *asize 





0006 

ABC 

0005 

0001 




arr 

sum fi) 

lim 

step i 



R0 R1 

R2 

R3 R4 

R5 

R6 R7 


[ -- 

0004 


0003 0001 

0003 

0005 

FORI 

r4 => 0009 

0001 

SLOAD 

R5 


=> 0006 

TGETV 

r8 , rl 3 r? 

0002 

LE 

0001 

+214748364 

0007 

ADDVV 

r3 y r3> r8 

j 0003 

SLOAD 

R4 


0008 

FGRL 

r4 => 0006 

0004 

SLOAD 

R1 





0005 

FLOAD 

0004 

tab *asize 




0006 

ABC 

0005 

0001 




0007 

FLOAD 

0004 

tab.arrav 





orr 

sum fi) 

lim 

step i 



R0 

R1 

R2 

R3 R4 

R5 

R6 R7 


[ -- 


0004 


0003 0001 

0003 

0005 

FORI 

r4 => 

0009 

0001 

SLOAD 

R5 


=> 0006 

TGETV 

r8 , r 1 

, r 7 

0002 

LE 

0001 

+214748364 

0007 

ADDVV 

r3 j r3 

, r8 

j 0003 

SLOAD 

R4 


0008 

FORL 

r4 => 

0006 

0004 

SLOAD 

R1 






0005 

FLOAD 

0004 

tab *asize 





0006 

ABC 

0005 

0001 





0007 

FLOAD 

0004 

tab. array 





0008 

AREF 

0007 

0003 


arr 


sum (i) lim step i 

R0 R1 R2 R3 R4 R5 R6 R7 R8 

[ 0004 0003 0001 0003 0009 ] 


0005 

FORI 

r4 => 0009 

0001 

SLOAD 

R5 


=> 0006 

TGETV 

r8 , rlj r? 

0002 

LE 

0001 

+2147483646 

0007 

ADDVV 

r3 y r3 f r8 

j 0003 

SLOAD 

R4 


0008 

FORL 

r4 = > 00G6 

0004 

SLOAD 

R1 





0005 

FLOAD 

0004 

tab , asize 




0006 

ABC 

0005 

0001 




0007 

FLOAD 

0004 

tab. array 




0008 

AREF 

0007 

0003 




0009 

ALOAD 

0008 










milts 


arr 


sum (i) lim step i 
R0 R1 R2 R3 R4 R5 R6 R7 R 8 

[ 0004 ---- — 0003 0001 0003 0009 ] 


0005 

FORI 

r4 => 0009 

0001 

SLOAD 

R5 


0006 

TGETV 

rS, rlj r? 

0002 

LE 

0Q01 

+2147483646 

0007 

ADDVV 

r3 f r3> r8 

j 0003 

SLOAD 

R4 


0008 

FORL 

r4 => 0006 

0004 

SLOAD 

R1 





0005 

FLOAD 

0004 

tab*asize 




0006 

ABC 

0005 

0001 




0007 

FLOAD 

0004 

tab.array 




0008 

AREF 

0007 

0003 




0009 

ALOAD 

0008 



arr 


R0 




0005 

FORI 

r 4 = 

0006 

TGETV 

r8, 

0007 

ADDVV 

r3, 
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0008 


0010 

SLOAD 

#4 

T 

0011 

ADD 

0010 

0009 

0012 

ADD 

0003 

+ 1 

0013 

LE 

0012 

0Q01 

» 4 * « 

SNAP 

[ — 


* * * * 

SNAP 

[ — 




0001 

0002 

0012 

0004 

0005 

0006 
0007 


0015 AREF 

0007 

0012 


0016 ALOAD 

0015 



0011 




0017 ADD 

0011 

0016 


0018 ADD 

0012 

+ 1 


0019 LE 

0018 

0001 


0011 0012 

0001 - 

0012 

] 

0017 0018 

0001 - 

— 0018 

] 


in /S b 


0001 

SLOAD

#6 

CRI 

0002 

LE 

0001 

+2147483646 

0003 

SLOAD 

#5 

Cl 

0004 

SLOAD 

#2 

T 

0005 

FLOAD 

0004 

tab.asize

0006 

ABC 

0005 

0001 

0007 

FLOAD

0004 

tab.array 

0008 

AREF 

0007 

0003 

0009 

ALOAD

0008 


0010 

SLOAD 

#4 

T 

0011 

ADD 

0010 

0009 

0012 

ADD 

0003 

+1 

0013 

LE 

0012 

0001 


0001 

0002 

0012 

0004 

0005 

0006 
0007 


0015 

AREF 

0007 

0012 

0016 

ALOAD 

0015 


0011 

0017 

ADD 

0011 

0016 

0018 

ADD 

0012 

+ 1 

0019 

LE 

0018 

0001 

00 20 

PHI 

0012 

0018 

0021 

PHI 

0011 

0017 




LJFOLD(FLOAD SNEW IRFL_STR_LEN) 
LJFOLDF (fload_str_len_snew) 

{ 

/* Return length passed to SNEW. */
return fleft->op2; 


} 


LJFOLD(FLOAD SNEW IRFL_STR_LEN) 

LJFOLDF (fload_str_len_snew) 

{ 

/* Return length passed to SNEW. */ 

/* What if fleft is not invariant? */ 
return fleft->op2; 

} 


RJltS 


LJFOLD(FLOAD SNEW IRFL_STR_LEN) 
LJFOLDF (fload_str_len_snew) 

{ 

/* Return length passed to SNEW. */
return fleft->op2; 

} 


LJFOLD(FLOAD SNEW IRFL_STR_LEN) 
LJFOLDF (fload_str_len_snew) 

{ 

/* Return length passed to SNEW. */
return fleft->op2; 

} 


DCE 

LOOP 

SPLIT 

SINK 


mftri 


mble 


asm_guardcc (as , CC_E) ; 

emit_rr (as, XO_TEST, RID_RET, RID_RET) ; 


asm_guardcc(as, CC_E);
emit_rr(as, XO_TEST, RID_RET, RID_RET);
/* looks a bit strange? */


asm_guardcc(as, CC_E);
emit_rr(as, XO_TEST, RID_RET, RID_RET);
/* assembled backwards! */
/* test rax, rax; je ... */


Tffl.'its 


linear scan


THE END 


■suits 


wwzra 


tab.fld 




0003 int FLOAD 0002 tab.hmask

0004 int EQ 0003 XXXX

0005 p32 FLOAD 0002 tab.node

0006 p32 HREFK 0005 "fld" @YYYY

0007 num HLOAD 0006


cmp dword [rdx+0x1c], XXXX
jnz ->0

mov ecx, [rdx+0x14] ; tab.node
mov rdi, 0xfffffffb00052de0 ; "fld"
cmp rdi, [rcx+YYYY]
jnz ->0

lea eax, [rcx+0x18]

cmp dword [rax+0x4], 0xfffeffff

jnb ->0 ; is num?







local M = {}
function M:getFld()
  return self.fld
end

local s = setmetatable({fld = 1},
                       { __index = M })

local sum = 0
for i = 0, 100 do
  sum = sum + s:getFld()
end


0003 


p32 

HREF 

0002 

■fgetFld" 

0004 

> 

p32 

EQ 

0003 

[0X00042458] 

0005 


tab 

FLOAD 

0002 

tab.meta 

0006 

> 

tab 

NE 

0005 

NULL 

0007 


int 

FLOAD 

0005 

tab, hmask 

0008 

> 

int 

EQ 

0007 

+ 1 

0009 


p32 

FLOAD 

0005 

tab. node 

0010 

> 

p32 

HREFK 

0009 

IT index 1 ' @1 

0011 

> 

tab 

HLOAD 

0010 


0012 


i nt 

FLOAD 

0011 

tab, hmask 

0013 

> 

int 

EQ 

0012 

+ 1 

0014 


p32 

FLOAD 

0011 

tab. node 

0015 

> 

p32 

HREFK 

0014 

M getFld TI @0 

0016 

> 

fun 

HLOAD 

0015 


0017 

> 

fun 

EQ 

0016 

y . lua;4 


fld load here


zu'zre 


r f r 


* r t 


0003 


p32 

HREF 

0002 

"getFld" 

0004 

> 

p32 

EQ 

0003 

[0X00042458] 

0005 


tab 

FLOAD 

0002 

tab.meta 

0006 

> 

tab 

NE 

0005 

NULL 

0007 


int 

FLOAD 

0005 

tab, hmask 

0008 

> 

int 

EQ 

0007 

+ 1 

0009 


p32 

FLOAD 

0005 

tab. node 

0010 

> 

p32 

HREFK 

0009 

IT index 1 ' @1 

0011 

> 

tab 

HLOAD 

0010 


0012 


i nt 

FLOAD 

0011 

tab, hmask 

0013 

> 

int 

EQ 

0012 

+ 1 

0014 


p32 

FLOAD 

0011 

tab. node 

0015 

> 

p32 

HREFK 

0014 

M getFld TI @0 

0016 

> 

fun 

HLOAD 

0015 


0017 

> 

fun 

EQ 

0016 

y . lua;4 

* m r 

fid 

load here 

* * * 



mins 


0003 


p32 

HREF 

0002 

■fgetFld" 

0004 

> 

p32 

EQ 

0003 

[0X00042458] 

0005 


tab 

FLOAD 

0002 

fab*meta 

0006 

> 

tab 

NE 

0005 

NULL 

0007 


int 

FLOAD 

0005 

tab.hmask 

0008 

> 

int 

EQ 

0007 

+1 

0009 


p32 

FLOAD 

0005 

tab* node 

0010 

> 

p32 

HftEFK 

0009 

"—Index" @1 

0011 

> 

tab 

HLOAD 

0010 


0012 


i nt 

FLOAD 

0011 

tab.hmask 

0013 

> 

int 

EQ 

0012 

+ 1 

0014 


p32 

FLOAD 

0011 

tab. node 

0015 

> 

p32 

HREFK 

0014 

M getFld TI @8 

0016 

> 

fun 

HLOAD 

0015 


0017 

> 

fun 

EQ 

0016 

y . lua:4 

* i * 

fid 

load here 

* * * 



■rolra 


0003 


p32 

HREF 

0002 

■fgetFld" 

0004 

> 

p32 

EQ 

0003 

[0X00042458] 

0005 


tab 

FLOAD 

0002 

tab.meta 

0006 

> 

tab 

NE 

0005 

NULL 

0007 


int 

FLOAD 

0005 

tab, hmask 

0008 

> 

int 

EQ 

0007 

+ 1 

0009 


p32 

FLOAD 

0005 

tab. node 

0010 

> 

p32 

HREFK 

0009 

" index 1 ' @1 

0011 

> 

tab 

HLOAD 

0010 


0012 


int 

FLOAD 

0011 

tab.hmask 

0013 

> 

int 

EQ 

0012 

+1 

0014 


p32 

FLOAD 

0011 

tab* node 

0015 

> 

p32 

HREFK 

0014 

"getFld 11 @0 

0016 

> 

fun 

HLOAD 

0015 


0017 

> 

fun 

EQ 

0016 

y . lua;4 


fld load here


t m * 


* + * 


problematic if not 

invariant 


wJlra 


traces are not 
reentrant 

[can't call lua_CFunction & stay on trace]

[though LJ2.1 has stitching] 




local str = "abed" 
local sum = 0 
for i = 0, 100 do 
str = str : gsub ( ' 

: gsub ( ' 


end 


C func 
C func 


gsub 

X 


* 

* gsub 

. * X 


* — + 


WtftB 


gsub 

X 


* 

* gsub 

. * X 


* — + 

state transfer via • 
interpreter state • 




builtin library? 


T5WZW 


builtin library? 

[need to record manually] 

[LJ2.1 has LJLIB_LUA]


206ft19 


LJLIB_LUA(table_remove) /*
  function(t, pos)
    CHECK_tab(t)
    local len = #t
    if pos == nil then
      if len ~= 0 then
        local old = t[len]
        t[len] = nil
        return old
      end
    else
      ...
    end
  end
*/


xn/zw 



Ustftti 


ffi.cdef[[
  typedef struct { int32_t x, y; } S;
  double f(S* p, size_t n);
]]

local S = ffi.typeof('S')

local arr = ffi.new('S[?]', 2)
arr[0] = S(1, 2)
arr[1] = S(3, 4)
ffi.C.f(arr, 2)


zraJira 


ffi objects have 
frozen metatables 

[see issue #41 for normal tables] 


ffi.cdef[[
  typedef struct { int32_t x, y; } S;
]]

local M = {}

function M:getX() return self.x end

local S = ffi.metatype('S', { __index = M })

local s = S(1, 2)

local sum = 0
for i = 0, 100 do
  sum = sum + s:getX()
end


zuvira 


0003 

ul6 

FLOAD 

0002 

0004 > 

int 

EQ 

0003 

0005 

p64 ADD 

0002 

0006 

i nt 

XLOAD 

0005 


no table probing! 


cdata . ctypeid 

+XXXX 

+YYYY 


2t)WZt9 


side-traces 




side-traces 

[not all values are carried inside] 

[rejoins at the trace entry] 



one more thing 


local function faster(arr, n)
  local sum = 0
  for i = 1, n do
    sum = sum + arr[i]
  end
  return sum
end

local function slower(arr, n)
  local sum, i = 0, 1
  while i <= n do
    sum = sum + arr[i]
    i = i + 1
  end
  return sum
end


nai'zts 


What I learned from 

Lua 


zn/zw 


ELEGANCE IS A

DOUBLE-EDGED 

SWORD 



DO NOT FEAR 

THE PREPROCESSING 



USERS DON'T 
UNDERSTAND 

WHAT IS FAST 



PERFORMANCE 
IMPLICATIONS OF 
TRACING ARE 

NONTRIVIAL 



SEARCH FOR 

THE BALANCE 



MAKE YOUR 

OWN RULES 



THANK YOU! 



